Workshop: overview

In this workshop we will show the capability of TCGAbiolinks and Moonlight, to integrate multi -omics data from different consortium and to reproduce the six immune subtypes from TCGA PanCancer and how features (Immune Subtypes, Oncogenic Processes, Driver Genes and Stemness) can be used by the end user to expand their understating of their own un-published data.

The workshop is organized in 4 subsections:

  1. Data retrieval from (TCGA, GTEx, GEO and IHEC)
  2. Immune Subtypes
  3. Oncognic Processes and Driver Genes
  4. Stemness scores

Data retrieval from (GDC, TCGA, GEO and IHEC)

Validation of Stemness signature in external dataset. Nazor et al., Cell Stem Cell 2012

# Firstly we have previously prepared the Gene Expression matrix (genes in rows , samples in columns) 
# for Nazor's dataset using Moonlight and getGEO's function.

TCGA_mRNA_StemScoreTable <- TCGAanalyze_Stemness(stemSig = PCBC_stemSig,
                                          dataGE = NazorMatrix)



tab_mRNASi <- TCGA_mRNA_StemScoreTable

tab_mRNASi_merged <- merge(x = tab_mRNASi,
                    y = dataNazor_samples,
                    by.x = "Sample",
                    by.y = "geo_accession")
require(ggplot2)
require(ggpubr)

colnames(tab_mRNASi_merged)[3] <- "mRNASi"

tab_mRNASi_merged[tab_mRNASi_merged$CellType %in% "embryonic stem cell","CellType"] <- "ES"
tab_mRNASi_merged[tab_mRNASi_merged$CellType %in% "induced pluripotent stem cell","CellType"] <- "IPS"
tab_mRNASi_merged[tab_mRNASi_merged$CellType %in% "parthenogentic embryonic stem cell","CellType"] <- "Parthenogentic"
tab_mRNASi_merged[tab_mRNASi_merged$CellType %in% "Somatic.Primary","CellType"] <- "Primary"
tab_mRNASi_merged[tab_mRNASi_merged$CellType %in% "Somatic.Tissue","CellType"] <- "Tissue"

tab_mRNASi_merged <- tab_mRNASi_merged[order(tab_mRNASi_merged$mRNASi, decreasing = FALSE),]
library(forcats)
p <- ggplot(data=tab_mRNASi_merged, aes(x=Sample, y=mRNASi, fill = CellType)) +
    geom_bar(stat="identity")+
    scale_colour_gradient2()+
    #coord_flip()+
   # ylim(0, 15)+
    #scale_x_discrete(limits = df1$Tissue)+
    theme_classic() +
    theme(axis.title.x=element_blank(),
                          axis.text.x=element_blank(),
                          axis.ticks.x=element_blank())+
    scale_fill_manual("legend", values = c("Primary" = "orange",
                                           "Parthenogentic" = "blue",
                                           "Tissue" = "darkgreen",
                                           "ES" = "red",
                                           "IPS" = "pink"))
p <- p + theme(legend.position="bottom")

p <- p + aes(x = fct_inorder(Sample))
ggsave(p, file = "Validation_mRNASi_Nazor.png", width = 6,height = 6)

The result from TCGAanalyze_Stemness validation in Nazor’s Dataset is shown below:

Validation of Stemness signature in external dataset. IHEC Author et al., Cell 2016

# stemness score with IHEC samples
load("./IHEC_genes_matrix.Rdata")
load("./IHEC_Sample_names.rdata")
IHECMatrix <- genes_matrix
require(stringr)
IHEC_annot <- sample_names
colnames(IHEC_annot) <- c("Diffname_short","X1")
IHEC_annot <- as.data.frame(IHEC_annot)

library("biomaRt")
ensemblsIDS <- str_split_fixed(rownames(IHECMatrix),"\\.",2)[,1]
rownames(IHECMatrix)<- ensemblsIDS

IHEC_annot_common <- IHEC_annot[IHEC_annot$X1 %in% colnames(IHECMatrix),]


# mapping Ensembl IDs to gene Symbol

require(TCGAbiolinks)
library(clusterProfiler)
library(org.Hs.eg.db)
gene.df <- bitr(ensemblsIDS, fromType = "ENSEMBL",
                toType = c( "ENTREZID", "SYMBOL"),
                OrgDb = org.Hs.eg.db)


IHECMatrix_sel <- IHECMatrix[gene.df$ENSEMBL,]
rownames(IHECMatrix_sel) <-gene.df$SYMBOL
IHECMatrix_sel <- as.data.frame(IHECMatrix_sel)

IHECMatrix_sel_filt <- TCGAanalyze_Filtering(tabDF = IHECMatrix_sel,
                                             method = "quantile",
                                             qnt.cut = 0.25)

TCGA_mRNA_StemIHEC <- TCGAanalyze_Stemness(stemSig = PCBC_stemSig,
                                                 dataGE = IHECMatrix_sel_filt)

colnames(TCGA_mRNA_StemIHEC)[3] <- "mRNASi"
colnames(IHEC_annot)[1] <- "CellType"
colnames(IHEC_annot)[2] <- "Sample"
TCGA_mRNA_StemIHEC_merge<- merge(x = TCGA_mRNA_StemIHEC,
                                 y = IHEC_annot,
                                 by.x = "Sample",
                                 by.y = "Sample")
tab_mRNASi_merged <- TCGA_mRNA_StemIHEC_merge

CellType <- table(tab_mRNASi_merged$CellType)



CellType_filt <- CellType[CellType > 5]

tab_mRNASi_merged_filt <- tab_mRNASi_merged[tab_mRNASi_merged$CellType %in% names(CellType_filt), ]
tab_mRNASi_merged <- tab_mRNASi_merged_filt


tab_mRNASi_merged <- tab_mRNASi_merged[order(tab_mRNASi_merged$mRNASi, decreasing = FALSE),]
require(ggplot2)
library(forcats)
p <- ggplot(data=tab_mRNASi_merged, aes(x=Sample, y=mRNASi, fill = CellType)) +
    geom_bar(stat="identity")+
    scale_colour_gradient2()+
    #coord_flip()+
    # ylim(0, 15)+
    #scale_x_discrete(limits = df1$Tissue)+
    theme_classic() +
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank())
p <- p + theme(legend.position="right")

p <- p + aes(x = fct_inorder(Sample))
p <- p + theme(text = element_text(size=14))
ggsave(p, file = "Validation_mRNASi_IHEC.png", width = 12,height = 6)

The result from TCGAanalyze_Stemness validation in IHEC’s Dataset is shown below:

Generating Stemness score for TCGA gene expression data (mRNASi). Malta et al. Cell, 2018

# Firstly we load Gene Expression matrix (genes in rows , samples in columns) 
# from a previous generated pancancer gene expression matrix
load("~/Dropbox (Personal)/Umiami/TCGAanalysis/Stemness/dataFilt_panCancer33new.Rdata")

curCancer <- "BRCA"
# We have previously generated a table with 33 cancer types barcodes and molecular subtypes.

dataSubt_PanCancer <- PanCancerAtlas_subtypes()

# Selecting TCGA breast cancer

dataSubt_curCancer <- dataSubt_PanCancer[dataSubt_PanCancer$cancer.type %in% "BRCA",]

commonSamples <- intersect(dataSubt_curCancer$pan.samplesID, colnames(dataFilt))

dataFilt_curCancer <- dataFilt[,commonSamples]
load("~/Dropbox (Personal)/Umiami/Github/TCGAbiolinks/data/PCBC_stemSig.rda")

TCGA_mRNA_StemScoreTable <- TCGAanalyze_Stemness(stemSig = PCBC_stemSig,
                                          dataGE = dataFilt_curCancer)

colnames(TCGA_mRNA_StemScoreTable)[1] <- "barcode"

sampleNT <- TCGAquery_SampleTypes(barcode = TCGA_mRNA_StemScoreTable$barcode,typesample = "NT")
sampleTP <- TCGAquery_SampleTypes(barcode = TCGA_mRNA_StemScoreTable$barcode,typesample = "TP")
#sampleTM <- TCGAquery_SampleTypes(barcode = TCGA_mRNA_StemScoreTable$barcode,typesample = "TM")
#sampleTAM <- TCGAquery_SampleTypes(barcode = TCGA_mRNA_StemScoreTable$barcode,typesample = "TAM")
#sampleTB <- TCGAquery_SampleTypes(barcode = TCGA_mRNA_StemScoreTable$barcode,typesample = "TB")

rownames(TCGA_mRNA_StemScoreTable) <- TCGA_mRNA_StemScoreTable$barcode

TCGA_mRNA_StemScoreTable[sampleNT,"SampleType"] <- "NT"
TCGA_mRNA_StemScoreTable[sampleTP,"SampleType"] <- "TP"
#TCGA_mRNA_StemScoreTable[sampleTM,"SampleType"] <- "TM"
#TCGA_mRNA_StemScoreTable[sampleTAM,"SampleType"] <- "TM"
#TCGA_mRNA_StemScoreTable[sampleTB,"SampleType"] <- "TB"

tab_mRNASi <- TCGA_mRNA_StemScoreTable

tab_mRNASi_merged <- merge(x = tab_mRNASi,
                    y = dataSubt_PanCancer,
                    by.x = "barcode",
                    by.y = "pan.samplesID")
require(ggplot2)
require(ggpubr)

colnames(tab_mRNASi_merged)[3] <- "mRNASi"



p<-ggplot(tab_mRNASi_merged, aes(x=SampleType, y=mRNASi, fill=SampleType))
p <- p + theme_classic()

p <- p +  theme(legend.position="none")
p <- p + rotate_x_text(45)
p <- p + geom_jitter(shape=16, position=position_jitter(0.2), color = "black")
p <- p + geom_boxplot(position=position_dodge(1),outlier.colour = NA)
p <- p + theme(text = element_text(size=20))
ggsave(p , filename = "TCGA_BRCA_mRNASi_TP_NT.png",width = 5,height = 6)


tab_mRNASi_merged_TP <- tab_mRNASi_merged[tab_mRNASi_merged$SampleType %in% "TP",]

p<-ggplot(tab_mRNASi_merged_TP, aes(x=Subtype_Selected, y=mRNASi, fill=Subtype_Selected))
p <- p + theme_classic()

p <- p +  theme(legend.position="none")
p <- p + rotate_x_text(45)
p <- p + geom_jitter(shape=16, position=position_jitter(0.2), color = "black")
p <- p + geom_boxplot(position=position_dodge(1),outlier.colour = NA)
p <- p + theme(text = element_text(size=20))
ggsave(p , filename = "TCGA_BRCA_mRNASi_subtypes.png",width = 5,height = 6)

The result from TCGAanalyze_Stemness is shown below:

The result from TCGAanalyze_Stemness with molecular subtypes is shown below:

Generating Stemness score for TCGA LGG and GBM gene expression data (mRNASi). Malta et al. Cell, 2018

# working with LGG and GBM
curCancer <- c("LGG","GBM")
# We have previously generated a table with 33 cancer types barcodes and molecular subtypes.
require(TCGAbiolinks)
dataSubt_PanCancer <- PanCancerAtlas_subtypes()

# Selecting TCGA breast cancer

dataSubt_curCancer <- dataSubt_PanCancer[dataSubt_PanCancer$cancer.type %in% curCancer,]

sampleCurCancer <- colnames(dataFilt)
sampleCurCancer <- sampleCurCancer[substr(sampleCurCancer,1,12) %in% dataSubt_curCancer$pan.samplesID]

dataFilt_curCancer <- dataFilt[,sampleCurCancer]
load("~/Dropbox (Personal)/Umiami/Github/TCGAbiolinks/data/PCBC_stemSig.rda")

TCGA_mRNA_StemScoreTable <- TCGAanalyze_Stemness(stemSig = PCBC_stemSig,
                                                 dataGE = dataFilt_curCancer)

colnames(TCGA_mRNA_StemScoreTable)[1] <- "barcode"


tab_mRNASi <- TCGA_mRNA_StemScoreTable
tab_mRNASi <- cbind(barcode12 = substr(tab_mRNASi$barcode,1,12),
                    tab_mRNASi)
tab_mRNASi$barcode12 <- as.character(tab_mRNASi$barcode12)


tab_mRNASi_merged <- merge(x = tab_mRNASi,
                           y = dataSubt_PanCancer,
                           by.x = "barcode12",
                           by.y = "pan.samplesID")
require(ggplot2)
require(ggpubr)

colnames(tab_mRNASi_merged)[4] <- "mRNASi"

p<-ggplot(tab_mRNASi_merged, aes(x=Subtype_Selected, y=mRNASi, fill=Subtype_Selected))
p <- p + theme_classic()
p <- p +  theme(legend.position="none")
p <- p + rotate_x_text(45)
p <- p + geom_jitter(shape=16, position=position_jitter(0.2), color = "black")
p <- p + geom_boxplot(position=position_dodge(1),outlier.colour = NA)
p <- p + theme(text = element_text(size=20))
ggsave(p , filename = "TCGA_LGG_GBM_mRNASi_subtypes.png",width = 5,height = 6)

The result from TCGAanalyze_Stemness for LGG and GBM is shown below:

Stemness index is associated with overall survival (Glioma example)

#working with survival glioma

tab_mRNASi_merged <- cbind(mRNASi_level = rep("mRNASi mod",nrow(tab_mRNASi_merged)),
                           tab_mRNASi_merged)

tabSi<- tab_mRNASi_merged
tabSi$mRNASi_level <- as.character(tabSi$mRNASi_level)

tabSi[tabSi$mRNASi < quantile(tabSi$mRNASi,1/3),"mRNASi_level"] <- "mRNASi Low"
tabSi[tabSi$mRNASi > quantile(tabSi$mRNASi,2/3),"mRNASi_level"] <- "mRNASi High"


require(TCGAbiolinks)

dataClin_LGG <- GDCquery_clinic(project = "TCGA-LGG",type = "clinical")
dataClin_GBM <- GDCquery_clinic(project = "TCGA-GBM",type = "clinical")

dataClin_LGG_GBM <- rbind(dataClin_LGG,dataClin_GBM)

dataClin_LGG_GBM <- dataClin_LGG_GBM[dataClin_LGG_GBM$submitter_id %in% tabSi$barcode12,]

dataClin_merged <- merge(x = dataClin_LGG_GBM,
                         y = tabSi,
                         by.x = "submitter_id",
                         by.y = "barcode12")




p <- TCGAanalyze_survival(dataClin_merged,
                          clusterCol = "mRNASi_level",
                          conf.int = FALSE,
                          main = "TCGA LGG GBM mRNASi", 
                          height = 10,
                          width=10,
                          risk.table = TRUE,
                          filename = NULL)

p <- p$plot

p <- p  + theme(legend.position = "bottom")

ggsave(p , filename = "TCGA_LGG_GBM_mRNASi_survival.png",width = 5,height = 6)

The result from the Stemness-survival association for LGG and GBM is shown below:

In this section we generate the Immune subtypes for TCGA and GEO tumors. Figure1C readapted from Thorsson et al., Immunity, 2018, to summarize features of six different immune subtypes.

KM survival analysis of TCGA KIRC samples with Immune Subtypes from Thorsson et al., Immunity, 2018

download.file(url = "https://ars.els-cdn.com/content/image/1-s2.0-S1074761318301213-mmc2.xlsx",
               destfile = "X1_s2_0_S1074761318301213_mmc2")
X1_s2_0_S1074761318301213_mmc2 <- read_excel("X1_s2_0_S1074761318301213_mmc2")
X1_s2_0_S1074761318301213_mmc2 <- as.data.frame(X1_s2_0_S1074761318301213_mmc2)

CancerType <- c("KIRC")

ImmuneSubtypes <- as.data.frame(X1_s2_0_S1074761318301213_mmc2)

ImmuneSubtypes <- ImmuneSubtypes[ImmuneSubtypes$`TCGA Study` %in% CancerType,]
ImmuneSubtypes <- ImmuneSubtypes[ImmuneSubtypes$`Immune Subtype`!="NA",]

dataClin_KIRC <- GDCquery_clinic(project = "TCGA-KIRC",type = "clinical")

dataClin_merged <- merge(x = dataClin_KIRC,
                         y = ImmuneSubtypes,
                         by.x = "submitter_id",
                         by.y = "TCGA Participant Barcode")


p <- TCGAanalyze_survival(dataClin_merged,
                          clusterCol = "Immune Subtype",
                          conf.int = FALSE,
                          main = "TCGA KIRC Immune Subtypes",
                          height = 10,
                          width=10,
                          risk.table = TRUE,
                          filename = NULL)

p <- p$plot

p <- p  + theme(legend.position = "right")

ggsave(p , filename = "TCGA_KIRC_ImmuneSubtypes_survival.png",width = 10,height = 6)

The result from the Immune subtypes -survival association for BRCA samples is shown below:

Immune Subtypes for GEO samples (KIRC example) using data from Jones et al.,Clin Cancer Res 2005 (16115910)

require(MoonlightR)
 
#Jones J, Otu H, Spentzos D, Kolia S et al. Gene signatures of progression and metastasis in renal cell cancer. 
# Clin Cancer Res 2005 Aug 15;11(16):5730-9. PMID: 16115910

dataGEO_KIRC<- getDataGEO(GEOobject = "GSE15641",
                        platform =  "GPL96")
require(SummarizedExperiment)

GSE15641 <- as.data.frame(exprs(dataGEO_KIRC))
GSE15641_non_norm <- cbind(ILMN = rownames(GSE15641),
                           IDmean = rowMeans(GSE15641),
                           GSE15641)

GSE15641_annot <- fData(dataGEO_KIRC)
GSE15641_annot <- as.data.frame(GSE15641_annot)
GSE15641_annot <- subset(GSE15641_annot,
                              select = c("ID","Gene.symbol"))


dataGEO_samples <- pData(dataGEO_KIRC)
dataGEO_samples <- as.data.frame(dataGEO_samples)
dataGEO_samples <- subset(dataGEO_samples,
                            select = c("geo_accession","source_name_ch1"))
colnames(dataGEO_samples)[2] <- "CellType"

dataGEO_samples <- dataGEO_samples[dataGEO_samples$CellType %in% "cancerous human kidney tissue, clear cell RCC",]

GSE15641_merge <- merge(x = GSE15641_annot,
                        y = GSE15641_non_norm,
                        by.x = "ID",
                        by.y = "ILMN")


GSE15641_merge <- GSE15641_merge[order(GSE15641_merge$IDmean,decreasing = TRUE),]
GSE15641_merge <- GSE15641_merge[!duplicated(GSE15641_merge$Gene.symbol),]
GSE15641_Matrix <- GSE15641_merge
rownames(GSE15641_Matrix) <- GSE15641_Matrix$Gene.symbol

GSE15641_Matrix <- GSE15641_Matrix[,dataGEO_samples$geo_accession]

dataImmuneSubtype <- TCGAanalyze_ImmuneSubtypes(ImmuneMW = ImmuneMW,
                                                dataGE = GSE15641_Matrix)

Oncogenic processes and driver genes using MoonlightR

To include some examples from http://bioconductor.org/packages/release/bioc/vignettes/MoonlightR/inst/doc/Moonlight.html

And some figures from https://www.sciencedirect.com/science/article/pii/S0092867418303131

Session Information


sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] grid      parallel  stats4    stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] png_0.1-7                   TCGAbiolinks_2.13.1        
##  [3] dplyr_0.8.1                 SummarizedExperiment_1.14.0
##  [5] DelayedArray_0.10.0         BiocParallel_1.18.0        
##  [7] matrixStats_0.54.0          Biobase_2.44.0             
##  [9] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [11] IRanges_2.18.0              S4Vectors_0.22.0           
## [13] BiocGenerics_0.30.0        
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_1.4-1            selectr_0.4-1              
##   [3] rjson_0.2.20                hwriter_1.3.2              
##   [5] circlize_0.4.6              XVector_0.24.0             
##   [7] GlobalOptions_0.1.0         clue_0.3-57                
##   [9] ggpubr_0.2                  matlab_1.0.2               
##  [11] ggrepel_0.8.1               bit64_0.9-7                
##  [13] AnnotationDbi_1.46.0        xml2_1.2.0                 
##  [15] codetools_0.2-16            splines_3.6.0              
##  [17] R.methodsS3_1.7.1           doParallel_1.0.14          
##  [19] DESeq_1.36.0                geneplotter_1.62.0         
##  [21] knitr_1.23                  jsonlite_1.6               
##  [23] Rsamtools_2.0.0             km.ci_0.5-2                
##  [25] broom_0.5.2                 annotate_1.62.0            
##  [27] cluster_2.0.8               R.oo_1.22.0                
##  [29] readr_1.3.1                 compiler_3.6.0             
##  [31] httr_1.4.0                  backports_1.1.4            
##  [33] assertthat_0.2.1            Matrix_1.2-17              
##  [35] lazyeval_0.2.2              limma_3.40.0               
##  [37] htmltools_0.3.6             prettyunits_1.0.2          
##  [39] tools_3.6.0                 gtable_0.3.0               
##  [41] glue_1.3.1                  GenomeInfoDbData_1.2.1     
##  [43] ggthemes_4.2.0              ShortRead_1.42.0           
##  [45] Rcpp_1.0.1                  Biostrings_2.52.0          
##  [47] nlme_3.1-139                rtracklayer_1.44.0         
##  [49] iterators_1.0.10            xfun_0.7                   
##  [51] stringr_1.4.0               rvest_0.3.4                
##  [53] XML_3.98-1.19               edgeR_3.26.1               
##  [55] zoo_1.8-5                   zlibbioc_1.30.0            
##  [57] scales_1.0.0                aroma.light_3.14.0         
##  [59] hms_0.4.2                   RColorBrewer_1.1-2         
##  [61] ComplexHeatmap_2.0.0        yaml_2.2.0                 
##  [63] memoise_1.1.0               gridExtra_2.3              
##  [65] KMsurv_0.1-5                ggplot2_3.1.1              
##  [67] downloader_0.4              biomaRt_2.40.0             
##  [69] latticeExtra_0.6-28         stringi_1.4.3              
##  [71] RSQLite_2.1.1               highr_0.8                  
##  [73] genefilter_1.66.0           foreach_1.4.4              
##  [75] GenomicFeatures_1.36.0      shape_1.4.4                
##  [77] rlang_0.3.4                 pkgconfig_2.0.2            
##  [79] bitops_1.0-6                evaluate_0.13              
##  [81] lattice_0.20-38             purrr_0.3.2                
##  [83] cmprsk_2.2-7                GenomicAlignments_1.20.0   
##  [85] bit_1.1-14                  tidyselect_0.2.5           
##  [87] plyr_1.8.4                  magrittr_1.5               
##  [89] R6_2.4.0                    generics_0.0.2             
##  [91] DBI_1.0.0                   mgcv_1.8-28                
##  [93] pillar_1.4.0                survival_2.44-1.1          
##  [95] RCurl_1.95-4.12             tibble_2.1.1               
##  [97] EDASeq_2.18.0               crayon_1.3.4               
##  [99] survMisc_0.5.5              rmarkdown_1.12             
## [101] GetoptLong_0.1.7            progress_1.2.2             
## [103] locfit_1.5-9.1              sva_3.32.0                 
## [105] data.table_1.12.2           blob_1.1.1                 
## [107] ConsensusClusterPlus_1.48.0 digest_0.6.18              
## [109] xtable_1.8-4                tidyr_0.8.3                
## [111] R.utils_2.8.0               munsell_0.5.0              
## [113] survminer_0.4.3